其他
Rsamtools 批量处理 bam 文件
没关注?伸出手指点这里---
1引言
之前推文对于 Ribo-seq 比对的 bam 文件都是示例单个样本操作的,往往情况下我们会有多个样本, 那么如何批量读入 bam 文件,批量绘图呢?
这里分享一些批量读取 bam 文件和批量绘图的方法,核心是学会 lapply 函数就行了
。
2分析
这里有两个 bam 文件作为示例:
library(Rsamtools)
library(tidyverse)
library(ggplot2)
library(stringr)
library(ggsci)
library(patchwork)
# bam file
bam <- c('ribo-EB.bam','ribo-ESC.bam')
# sample name
name <- c('ribo-EB','ribo-ESC')
# read bam files
bamfile <- lapply(bam,BamFile)
bamfile
# [[1]]
# class: BamFile
# path: ribo-EB.bam
# index: NA
# isOpen: FALSE
# yieldSize: NA
# obeyQname: FALSE
# asMates: FALSE
# qnamePrefixEnd: NA
# qnameSuffixStart: NA
#
# [[2]]
# class: BamFile
# path: ribo-ESC.bam
# index: NA
# isOpen: FALSE
# yieldSize: NA
# obeyQname: FALSE
# asMates: FALSE
# qnamePrefixEnd: NA
# qnameSuffixStart: NA
# scan bamfiles/take some time dependent on your file size
aln <- lapply(bamfile,scanBam)
# check
length(aln)
# [1] 2
可以看到读入的两个文件以 list 格式储存到 aln 对象里了。
read length
绘制两个样本的 read 长度分布,数据处理:
# read length
lapply(1:length(aln),function(x){
len <- aln[[x]][[1]]$qwidth %>% table %>% data.frame()
colnames(len)[1] <- 'pos'
# assign sample name
len$type <- name[x]
return(len)
}) %>% Reduce('rbind',.) %>% data.frame() -> plen
绘图:
# plot
plength <- ggplot(plen,aes(x = pos,y = Freq)) +
geom_col(fill = 'grey40') +
theme_bw(base_size = 16) +
scale_y_continuous(labels = scales::label_number(),) +
theme(axis.ticks.length = unit(0.25,'cm'),
aspect.ratio = 0.5,
axis.title = element_text(size = 18)) +
xlab('Footprint length (nt)') + ylab('Reads (Fraction)') +
facet_wrap(~type,scales = 'free_y',ncol = 2)
plength
frame distribution
数据分析:
# aggsign frame
lapply(1:length(aln), function(x){
# 统计 frame 比例
df <- data.frame(aln[[x]][[1]]$rname,aln[[x]][[1]]$pos,aln[[x]][[1]]$qwidth) %>% na.omit()
colnames(df) <- c('rname','pos','qwidth')
# 提取start codon位置
df$st_pos = sapply(str_split(as.character(df$rname),'_'),'[',4) %>% as.numeric()
# 提取stop codon位置
df$sp_pos = sapply(str_split(as.character(df$rname),'_'),'[',5) %>% as.numeric()
df$st_pos = df$st_pos + 1
# 计算total reads frame类型
df$yushu <- abs(df$pos-df$st_pos) %% 3
# assign frame type
df$frame <- case_when(
df$yushu == 0 ~ "Frame 0",
df$yushu == 1 ~ "Frame 1",
df$yushu == 2 ~ "Frame 2")
df$type <- name[x]
return(df)
}) %>% Reduce('rbind',.) %>% data.frame() -> all_df
# check
head(all_df,3)
# rname pos qwidth st_pos sp_pos yushu
# 1 Chmp1a_ENSMUSG00000000743_ENSMUST00000000759_88_676_2140 8 29 89 676 0
# 2 Chmp1a_ENSMUSG00000000743_ENSMUST00000000759_88_676_2140 14 30 89 676 0
# 3 Chmp1a_ENSMUSG00000000743_ENSMUST00000000759_88_676_2140 29 30 89 676 0
# frame type
# 1 Frame 0 ribo-EB
# 2 Frame 0 ribo-EB
# 3 Frame 0 ribo-EB
统计:
# 按read长度统计frame数量
len.frame <- all_df %>% filter(qwidth %in% c(25:35))
# 统计
len.frame.sum <- len.frame %>% group_by(type,qwidth,frame) %>%
summarise(count = n())
len.frame.sum$per <- len.frame.sum$count/sum(len.frame.sum$count)
绘图:
# bar plot
plengthframe <- ggplot(len.frame.sum,
aes(x = factor(qwidth),y = per,fill = factor(frame))) +
geom_col(show.legend = T,width = 0.75,position = position_dodge2()) +
theme_bw(base_size = 16) +
scale_y_continuous(labels = scales::label_percent(),n.breaks = 6) +
# scale_fill_brewer(palette = 'Set1',name = '',label = c('Frame 0','Frame 1','Frame 2')) +
scale_fill_npg(name = '',label = c('Frame 0','Frame 1','Frame 2')) +
theme(axis.ticks.length = unit(0.25,'cm'),
aspect.ratio = 0.6,
panel.grid = element_blank(),
axis.title = element_text(size = 18)) +
xlab('Footprint length (nt)') + ylab('Numbers of Reads') +
facet_wrap(~type,scales = 'free',ncol = 4)
plengthframe
read density on transcript
# 提取transcript length位置
all_df$len = sapply(str_split(as.character(all_df$rname),'_'),'[',6) %>% as.numeric()
# assign region
all_df$rel <- case_when(
all_df$pos < all_df$st_pos ~ abs(all_df$pos/(all_df$st_pos-1)),
all_df$pos >= all_df$st_pos & all_df$pos < all_df$sp_pos + 2 ~ (all_df$pos - all_df$st_pos)/(all_df$sp_pos +2 - all_df$st_pos)+1,
all_df$pos >= all_df$sp_pos + 2 ~ (all_df$pos - all_df$sp_pos -2)/(all_df$len - all_df$sp_pos - 2)+2
)
# to data frame
rel2cds <- data.frame(rel = all_df$rel,type = all_df$type)
# plot
ggplot(rel2cds,aes(x = rel)) +
geom_density(bw = 0.00001,size = 1,color = '#00AFC1') +
scale_x_continuous(breaks = c(0.5,1.5,2.5),labels = c("5'UTR","CDS","3'UTR")) +
theme_bw(base_size = 16) +
theme(aspect.ratio = 0.5) +
geom_vline(xintercept = c(1,2),lty = 'dashed',size = 1) +
facet_wrap(~type,scales = 'free',ncol = 2) +
xlab('Ribo density on transcript')
3结尾
bam 文件如果较大, 批量读入进来也会花费大量时间, R 特别耗内存,在内存较大的电脑上跑会快一些。
欢迎加入生信交流群。加我微信我也拉你进 微信群聊 老俊俊生信交流群
哦,数据代码已上传至QQ群,欢迎加入下载。
群二维码:
老俊俊微信:
知识星球:
所以今天你学习了吗?
今天的分享就到这里了,敬请期待下一篇!
最后欢迎大家分享转发,您的点赞是对我的鼓励和肯定!
如果觉得对您帮助很大,赏杯快乐水喝喝吧!
往期回顾
◀pysam 读取 bam 文件准备 Ribo-seq 质控数据
◀跟着 Cell Reports 学做图-CLIP-seq 数据可视化
◀m6A enrichment on peak center
◀m6A metagene distribution 纠正坐标
◀...